{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1ecc443f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "22/11/25 12:37:42 WARN Utils: Your hostname, Dobbins-MBP.local resolves to a loopback address: 127.0.0.1; using 192.168.31.130 instead (on interface en0)\n",
      "22/11/25 12:37:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "22/11/25 12:37:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
     ]
    }
   ],
   "source": [
    "# 如果不存在，执行以下命令安装：pip install pyspark\n",
    "from pyspark.sql import SparkSession\n",
    "spark = SparkSession\\\n",
    "        .builder\\\n",
    "        .appName(\"test_analyst_parquet_file_20221116\")\\\n",
    "        .getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9efef24e",
   "metadata": {},
   "outputs": [],
   "source": [
    "inpath='/Users/msxr/develop/tmp/2022-11-2500fb559d-3311-4263-859c-8f2e93ff5a31.parquet'\n",
    "outpath='/Users/msxr/develop/tmp/test_analyst_parquet_file_20221116/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "02ba4a8b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "df=spark.read.parquet(inpath)\n",
    "df.createOrReplaceTempView('t_test')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "96eefcca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------+--------------------+-----+-------------------+-------------+-------------------------------+\n",
      "|      bucket|                 key| size| last_modified_date|storage_class|intelligent_tiering_access_tier|\n",
      "+------------+--------------------+-----+-------------------+-------------+-------------------------------+\n",
      "|ads-algo-sg1|data_warehouse/od...|30642|2022-11-12 23:00:02|     STANDARD|                           null|\n",
      "+------------+--------------------+-----+-------------------+-------------+-------------------------------+\n",
      "only showing top 1 row\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "[Stage 2:=======================================>                   (2 + 1) / 3]\r",
      "\r",
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "df.show(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "24db458a",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"\"\"\n",
    "select app_version_name,length(app_version_name) len\n",
    "from t_test group by 1,2\n",
    "order by 2 desc\n",
    "\"\"\").write.csv(outpath)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "68a67bd4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------------------------+-------------------+\n",
      "|REGEXP(app_version_name, ^[a-zA-Z0-9])|   app_version_name|\n",
      "+--------------------------------------+-------------------+\n",
      "|                                 false|￾㄀　⸀㄀⸀㤀㠀开眀眀|\n",
      "|                                  true|              right|\n",
      "+--------------------------------------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "spark.sql(\"\"\"\n",
    "select regexp(app_version_name,'^[a-zA-Z0-9]')\n",
    "   , case when regexp(app_version_name,'^[a-zA-Z0-9]') then 'right' \n",
    "   else app_version_name end app_version_name\n",
    "from t_test \n",
    "where length(app_version_name)=11\n",
    "group by 1,2\n",
    "order by 2 desc\n",
    "\"\"\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "01bb5d32",
   "metadata": {},
   "outputs": [
    {
     "ename": "AnalysisException",
     "evalue": "Undefined function: 'format_datetime'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 2 pos 7",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAnalysisException\u001b[0m                         Traceback (most recent call last)",
      "\u001b[0;32m/var/folders/mj/gzzjyp5s3j5fq8jywxpy4dyc0000gn/T/ipykernel_93748/4012029879.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m spark.sql(\"\"\"\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mselect\u001b[0m \u001b[0mformat_datetime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mFROM_UNIXTIME\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1668654134000\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m1000\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;36m28800\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'HH'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \"\"\").show()\n",
      "\u001b[0;32m/opt/anaconda3/lib/python3.9/site-packages/pyspark/sql/session.py\u001b[0m in \u001b[0;36msql\u001b[0;34m(self, sqlQuery)\u001b[0m\n\u001b[1;32m    721\u001b[0m         \u001b[0;34m[\u001b[0m\u001b[0mRow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'row1'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'row2'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mRow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf1\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf2\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'row3'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    722\u001b[0m         \"\"\"\n\u001b[0;32m--> 723\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jsparkSession\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msqlQuery\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_wrapped\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    724\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    725\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mtable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtableName\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/anaconda3/lib/python3.9/site-packages/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m   1307\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1308\u001b[0m         \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1309\u001b[0;31m         return_value = get_return_value(\n\u001b[0m\u001b[1;32m   1310\u001b[0m             answer, self.gateway_client, self.target_id, self.name)\n\u001b[1;32m   1311\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/anaconda3/lib/python3.9/site-packages/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m    115\u001b[0m                 \u001b[0;31m# Hide where the exception came from that shows a non-Pythonic\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    116\u001b[0m                 \u001b[0;31m# JVM exception message.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 117\u001b[0;31m                 \u001b[0;32mraise\u001b[0m \u001b[0mconverted\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    118\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    119\u001b[0m                 \u001b[0;32mraise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAnalysisException\u001b[0m: Undefined function: 'format_datetime'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 2 pos 7"
     ]
    }
   ],
   "source": [
    "spark.sql(\"\"\"\n",
    "select format_datetime(FROM_UNIXTIME(1668654134000/1000+28800),'HH')\n",
    "\"\"\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d34fc5d9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
